前準備(データ読み込み、ライブラリのインポート)

library("psych")
## Warning: package 'psych' was built under R version 3.5.3
library("skimr")
## Warning: package 'skimr' was built under R version 3.5.3
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
## 
##     filter
library("plotly")
## Warning: package 'plotly' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
bank_marketing_train <- read.csv("../bank_marketing_train.csv")

1.ターゲットのペルソナを検討する

# y=yes/noのデータを抽出
bank_marketing_train_y <- bank_marketing_train[bank_marketing_train$y=="yes",]
bank_marketing_train_n <- bank_marketing_train[bank_marketing_train$y=="no",]


# データ数
num_yes = dim(bank_marketing_train_y)[1]
num_no = dim(bank_marketing_train_n)[1]

ヒストグラム(特徴が表れていそうなもの)

# 年齢(age)
pl_yes <- plot_ly(x = bank_marketing_train_y$age, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_n$age, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# => yesの方が、60以上が多い

# 職業(job)
pl_yes <- plot_ly(x = bank_marketing_train_y$job, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_n$job, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_y$job)/num_yes
##        admin.   blue-collar  entrepreneur     housemaid    management 
##   0.290832455   0.138040042   0.027924131   0.023182297   0.069810327 
##       retired self-employed      services       student    technician 
##   0.094309800   0.029504742   0.069810327   0.060590095   0.158324552 
##    unemployed       unknown 
##   0.030031612   0.007639621
summary(bank_marketing_train_n$job)/num_no
##        admin.   blue-collar  entrepreneur     housemaid    management 
##   0.248163483   0.235975691   0.036630159   0.026512622   0.069921197 
##       retired self-employed      services       student    technician 
##   0.035862161   0.033992253   0.099839722   0.016862562   0.164551890 
##    unemployed       unknown 
##   0.023640978   0.008047282
# => yesの方が、retired/studentが多く、blue-colorが少ない。特にstudentは約4倍、retiredは約3倍違いがでている

# 最終学歴(education)
pl_yes <- plot_ly(x = bank_marketing_train_y$education, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_n$education, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_y$education)/num_yes
##            basic.4y            basic.6y            basic.9y 
##         0.091938883         0.038461538         0.104320337 
##         high.school          illiterate professional.course 
##         0.223129610         0.001053741         0.127239199 
##   university.degree             unknown 
##         0.356691254         0.057165437
summary(bank_marketing_train_n$education)/num_no
##            basic.4y            basic.6y            basic.9y 
##        0.1028115400        0.0558634967        0.1534660077 
##         high.school          illiterate professional.course 
##        0.2331708294        0.0003673033        0.1262855616 
##   university.degree             unknown 
##        0.2877320689        0.0403031922
# => yesはilliterateが多い

# 連絡デバイス(contact)
pl_yes <- plot_ly(x = bank_marketing_train_y$contact, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_n$contact, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# => yesはcellularが多い

# 以前のキャンペーン結果(campaign)
pl_yes <- plot_ly(x = bank_marketing_train_y$poutcome, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_n$poutcome, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_y$poutcome)/num_yes
##     failure nonexistent     success 
##   0.1290832   0.6817703   0.1891465
summary(bank_marketing_train_n$poutcome)/num_no
##     failure nonexistent     success 
##  0.09867103  0.88793909  0.01338988
# => yesはsuccessが多い

# 以前のキャンペーンの接触回数(previous)
pl_yes <- plot_ly(x = bank_marketing_train_y$previous, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_n$previous, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_y$previous)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4871  1.0000  6.0000
summary(bank_marketing_train_n$previous)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1315  0.0000  7.0000
# => yesは平均値が大きい(yes:0.48, no:0.13)

定性的な仮説

  • 年齢:入社後の22歳ごろと退職後の60歳ごろはyesが増えそう→60歳は合っている
  • 職業:student、unemployedはyesが少なそう→外れている。studentは逆。
  • 婚姻状況:divorced(離婚)はyesが少なそう→外れ。傾向なし
  • クレジットの支払遅延:無しはyesが多そう→外れ。傾向なし
  • 最終学歴:制度が調べられなかった
  • 不動産ローンの有無:無しはyesが多そう→外れ。傾向なし
  • 個人ローンの有無:無しはyesが多そう→外れ。傾向なし
  • 連絡デバイス:関係なさそう→外れ。yesはcellularが多い
  • 前回の接触からの経過日数:短い方がyesが多そう(担当者を覚えている)→外れ。傾向なし
  • 以前のキャンペーン結果:successがyesが多そう(継続してくれるのでは)→当たり
  • 以前のキャンペーンの接触回数:数が多い方がyesが多そう(担当者を覚えている)→当たり

ロジスティック回帰で各説明変数を見る

## ロジスティック回帰
## 個人に紐づく、架電前に得られる説明変数のみ利用
lr<-glm(y~age+job+marital+default+education+housing+
          loan+contact+day_of_week+pdays+poutcome+previous,
        data=bank_marketing_train, family="binomial")


## step関数でAICを減らす
lr2 <- step(lr)
## Start:  AIC=20763.61
## y ~ age + job + marital + default + education + housing + loan + 
##     contact + day_of_week + pdays + poutcome + previous
## 
##               Df Deviance   AIC
## - housing      1    20690 20762
## <none>              20690 20764
## - loan         1    20693 20765
## - education    7    20712 20772
## - day_of_week  4    20707 20773
## - age          1    20702 20774
## - poutcome     2    20704 20774
## - previous     1    20706 20778
## - marital      3    20712 20780
## - pdays        1    20732 20804
## - default      2    20841 20911
## - job         11    20871 20923
## - contact      1    21013 21085
## 
## Step:  AIC=20762.21
## y ~ age + job + marital + default + education + loan + contact + 
##     day_of_week + pdays + poutcome + previous
## 
##               Df Deviance   AIC
## <none>              20690 20762
## - loan         2    20694 20762
## - education    7    20713 20771
## - day_of_week  4    20708 20772
## - age          1    20703 20773
## - poutcome     2    20705 20773
## - previous     1    20706 20776
## - marital      3    20712 20778
## - pdays        1    20733 20803
## - default      2    20841 20909
## - job         11    20872 20922
## - contact      1    21013 21083
AIC(lr2)
## [1] 20762.21
summary(lr2)
## 
## Call:
## glm(formula = y ~ age + job + marital + default + education + 
##     loan + contact + day_of_week + pdays + poutcome + previous, 
##     family = "binomial", data = bank_marketing_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0639  -0.5021  -0.3837  -0.3010   2.8803  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.063603   0.292051  -3.642 0.000271 ***
## age                           0.007940   0.002243   3.541 0.000399 ***
## jobblue-collar               -0.274764   0.072316  -3.800 0.000145 ***
## jobentrepreneur              -0.199841   0.112115  -1.782 0.074673 .  
## jobhousemaid                 -0.120341   0.131644  -0.914 0.360642    
## jobmanagement                -0.094809   0.078278  -1.211 0.225825    
## jobretired                    0.612476   0.095242   6.431 1.27e-10 ***
## jobself-employed             -0.156807   0.110639  -1.417 0.156401    
## jobservices                  -0.209717   0.079214  -2.647 0.008109 ** 
## jobstudent                    0.788676   0.104768   7.528 5.16e-14 ***
## jobtechnician                -0.143732   0.064162  -2.240 0.025082 *  
## jobunemployed                 0.148129   0.117128   1.265 0.205986    
## jobunknown                   -0.178919   0.227348  -0.787 0.431292    
## maritalmarried                0.113629   0.062912   1.806 0.070893 .  
## maritalsingle                 0.298159   0.071041   4.197 2.70e-05 ***
## maritalunknown                0.126908   0.392171   0.324 0.746239    
## defaultunknown               -0.701074   0.060878 -11.516  < 2e-16 ***
## defaultyes                   -8.610256  84.476695  -0.102 0.918817    
## educationbasic.6y            -0.018758   0.111917  -0.168 0.866895    
## educationbasic.9y            -0.133104   0.086594  -1.537 0.124265    
## educationhigh.school         -0.062314   0.084078  -0.741 0.458606    
## educationilliterate           1.003878   0.635961   1.579 0.114446    
## educationprofessional.course -0.003529   0.092894  -0.038 0.969695    
## educationuniversity.degree    0.067691   0.083845   0.807 0.419475    
## educationunknown              0.268525   0.108378   2.478 0.013224 *  
## loanunknown                   0.041182   0.121540   0.339 0.734733    
## loanyes                      -0.101550   0.052615  -1.930 0.053602 .  
## contacttelephone             -0.805788   0.047171 -17.082  < 2e-16 ***
## day_of_weekmon               -0.141551   0.060938  -2.323 0.020186 *  
## day_of_weekthu                0.059907   0.058731   1.020 0.307711    
## day_of_weektue                0.054050   0.059707   0.905 0.365336    
## day_of_weekwed                0.066579   0.059795   1.113 0.265510    
## pdays                        -0.001463   0.000220  -6.649 2.95e-11 ***
## poutcomenonexistent           0.167511   0.092493   1.811 0.070130 .  
## poutcomesuccess               0.800045   0.214754   3.725 0.000195 ***
## previous                      0.245300   0.062066   3.952 7.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23735  on 33743  degrees of freedom
## Residual deviance: 20690  on 33708  degrees of freedom
## AIC: 20762
## 
## Number of Fisher Scoring iterations: 9
# => 1回目の分析で特徴が出ていたもののうち、age, job, contact, previousは影響ありそう

ペルソナの推定

  • Age:60以上
  • Job:retired
  • つまり定年退職者

Job:retiredのペルソナを基本に、データを絞ってペルソナを限定していく

# => Job:retiredの条件で、データを絞ってみてみる
bank_marketing_train_job_retired <- bank_marketing_train[bank_marketing_train$job == "retired",]

# y=yes/noのデータを抽出
bank_marketing_train_job_retired_y <- bank_marketing_train_job_retired[bank_marketing_train_job_retired$y=="yes",]
bank_marketing_train_job_retired_n <- bank_marketing_train_job_retired[bank_marketing_train_job_retired$y=="no",]
summary(bank_marketing_train_job_retired_y)
##       age                  job          marital   
##  Min.   :33.00   retired     :358   divorced: 78  
##  1st Qu.:60.00   admin.      :  0   married :270  
##  Median :68.00   blue-collar :  0   single  : 10  
##  Mean   :68.37   entrepreneur:  0   unknown :  0  
##  3rd Qu.:76.00   housemaid   :  0                 
##  Max.   :98.00   management  :  0                 
##                  (Other)     :  0                 
##                education      default       housing         loan    
##  basic.4y           :148   no     :330   no     :149   no     :297  
##  university.degree  : 60   unknown: 28   unknown:  9   unknown:  9  
##  high.school        : 54   yes    :  0   yes    :200   yes    : 52  
##  professional.course: 45                                            
##  unknown            : 28                                            
##  basic.9y           : 14                                            
##  (Other)            :  9                                            
##       contact    day_of_week    duration         campaign    
##  cellular :319   fri:64      Min.   :  63.0   Min.   : 1.00  
##  telephone: 39   mon:61      1st Qu.: 188.2   1st Qu.: 1.00  
##                  thu:62      Median : 311.0   Median : 1.00  
##                  tue:88      Mean   : 409.2   Mean   : 1.95  
##                  wed:83      3rd Qu.: 530.8   3rd Qu.: 2.00  
##                              Max.   :2093.0   Max.   :17.00  
##                                                              
##      pdays           previous             poutcome    emp.var.rate   
##  Min.   :  2.00   Min.   :0.0000   failure    : 51   Min.   :-3.400  
##  1st Qu.:  9.25   1st Qu.:0.0000   nonexistent:209   1st Qu.:-2.900  
##  Median :999.00   Median :0.0000   success    : 98   Median :-1.800  
##  Mean   :713.19   Mean   :0.6508                     Mean   :-1.939  
##  3rd Qu.:999.00   3rd Qu.:1.0000                     3rd Qu.:-1.700  
##  Max.   :999.00   Max.   :4.0000                     Max.   : 1.400  
##                                                                      
##  cons.price.idx  cons.conf.idx      euribor3m       nr.employed  
##  Min.   :92.20   Min.   :-50.80   Min.   :0.6340   Min.   :4964  
##  1st Qu.:92.65   1st Qu.:-42.70   1st Qu.:0.7205   1st Qu.:5009  
##  Median :93.08   Median :-37.50   Median :0.8760   Median :5018  
##  Mean   :93.24   Mean   :-37.58   Mean   :1.3329   Mean   :5052  
##  3rd Qu.:93.99   3rd Qu.:-31.40   3rd Qu.:1.3650   3rd Qu.:5099  
##  Max.   :94.77   Max.   :-26.90   Max.   :4.9680   Max.   :5228  
##                                                                  
##    y      
##  no :  0  
##  yes:358  
##           
##           
##           
##           
## 
summary(bank_marketing_train_job_retired_n)
##       age                  job           marital   
##  Min.   :23.00   retired     :1074   divorced:211  
##  1st Qu.:55.00   admin.      :   0   married :791  
##  Median :58.00   blue-collar :   0   single  : 68  
##  Mean   :60.08   entrepreneur:   0   unknown :  4  
##  3rd Qu.:63.75   housemaid   :   0                 
##  Max.   :95.00   management  :   0                 
##                  (Other)     :   0                 
##                education      default       housing         loan    
##  basic.4y           :339   no     :785   no     :496   no     :888  
##  university.degree  :182   unknown:289   unknown: 29   unknown: 29  
##  high.school        :181   yes    :  0   yes    :549   yes    :157  
##  professional.course:155                                            
##  basic.9y           :107                                            
##  basic.6y           : 58                                            
##  (Other)            : 52                                            
##       contact    day_of_week    duration         campaign     
##  cellular :724   fri:212     Min.   :   1.0   Min.   : 1.000  
##  telephone:350   mon:238     1st Qu.:  99.0   1st Qu.: 1.000  
##                  thu:192     Median : 160.0   Median : 2.000  
##                  tue:226     Mean   : 228.6   Mean   : 2.607  
##                  wed:206     3rd Qu.: 282.8   3rd Qu.: 3.000  
##                              Max.   :2055.0   Max.   :42.000  
##                                                               
##      pdays          previous             poutcome    emp.var.rate    
##  Min.   :  1.0   Min.   :0.0000   failure    :134   Min.   :-3.4000  
##  1st Qu.:999.0   1st Qu.:0.0000   nonexistent:902   1st Qu.:-1.8000  
##  Median :999.0   Median :0.0000   success    : 38   Median : 1.1000  
##  Mean   :957.4   Mean   :0.2086                     Mean   :-0.2944  
##  3rd Qu.:999.0   3rd Qu.:0.0000                     3rd Qu.: 1.4000  
##  Max.   :999.0   Max.   :4.0000                     Max.   : 1.4000  
##                                                                      
##  cons.price.idx  cons.conf.idx     euribor3m      nr.employed     y       
##  Min.   :92.20   Min.   :-50.8   Min.   :0.635   Min.   :4964   no :1074  
##  1st Qu.:92.96   1st Qu.:-42.7   1st Qu.:0.993   1st Qu.:5076   yes:   0  
##  Median :93.44   Median :-38.3   Median :4.856   Median :5191             
##  Mean   :93.48   Mean   :-38.9   Mean   :3.237   Mean   :5146             
##  3rd Qu.:93.99   3rd Qu.:-36.1   3rd Qu.:4.961   3rd Qu.:5228             
##  Max.   :94.77   Max.   :-26.9   Max.   :4.970   Max.   :5228             
## 
# データ数
num_retired_yes = dim(bank_marketing_train_job_retired_y)[1]
num_retired_no = dim(bank_marketing_train_job_retired_n)[1]

ヒストグラム

# 年齢
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$age, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$age, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# => yesの方が、60以上が多い

# 婚姻状況
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$marital, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$marital, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$marital)/num_retired_yes
##   divorced    married     single    unknown 
## 0.21787709 0.75418994 0.02793296 0.00000000
summary(bank_marketing_train_job_retired_n$marital)/num_retired_no
##    divorced     married      single     unknown 
## 0.196461825 0.736499069 0.063314711 0.003724395
# => yesはsingleが少ない

# クレジットの支払遅延
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$default, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$default, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$default)/num_retired_yes
##         no    unknown        yes 
## 0.92178771 0.07821229 0.00000000
summary(bank_marketing_train_job_retired_n$default)/num_retired_no
##        no   unknown       yes 
## 0.7309125 0.2690875 0.0000000
# => yesはunknownが少なく、9割が"no"

# 最終学歴
#plot_ly(x = bank_marketing_train_job_retired$education, type="histogram", color = bank_marketing_train_job_retired$y)
#plot_ly(x = bank_marketing_train_job_retired$education, type="box", color = bank_marketing_train_job_retired$y)
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$education, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$education, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$education)/num_retired_yes
##            basic.4y            basic.6y            basic.9y 
##         0.413407821         0.019553073         0.039106145 
##         high.school          illiterate professional.course 
##         0.150837989         0.005586592         0.125698324 
##   university.degree             unknown 
##         0.167597765         0.078212291
summary(bank_marketing_train_job_retired_n$education)/num_retired_no
##            basic.4y            basic.6y            basic.9y 
##        0.3156424581        0.0540037244        0.0996275605 
##         high.school          illiterate professional.course 
##        0.1685288641        0.0009310987        0.1443202980 
##   university.degree             unknown 
##        0.1694599628        0.0474860335
# => yesはbasic.4y, illiterate(学歴が高くない)が多い 

# 不動産ローンの有無
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$housing, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$housing, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$housing)/num_retired_yes
##         no    unknown        yes 
## 0.41620112 0.02513966 0.55865922
summary(bank_marketing_train_job_retired_n$housing)/num_retired_no
##         no    unknown        yes 
## 0.46182495 0.02700186 0.51117318
# => 大きな差はない(yesは少しローン有が多い)

# 個人ローンの有無
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$loan, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$loan, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$loan)/num_retired_yes
##         no    unknown        yes 
## 0.82960894 0.02513966 0.14525140
summary(bank_marketing_train_job_retired_n$loan)/num_retired_no
##         no    unknown        yes 
## 0.82681564 0.02700186 0.14618250
# => 差はなさそう

# 連絡デバイス
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$contact, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$contact, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$contact)/num_retired_yes
##  cellular telephone 
## 0.8910615 0.1089385
summary(bank_marketing_train_job_retired_n$contact)/num_retired_no
##  cellular telephone 
## 0.6741155 0.3258845
# => yesはcellularが多い

# 前回の接触からの経過日数
#plot_ly(x = bank_marketing_train_job_retired$pdays, type="histogram", color = bank_marketing_train_job_retired$y)
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$pdays, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$pdays, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$pdays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    9.25  999.00  713.19  999.00  999.00
summary(bank_marketing_train_job_retired_n$pdays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   999.0   999.0   957.4   999.0   999.0
# => yesは日数が短い人の割合が大きい

# 以前のキャンペーン結果
#plot_ly(x = bank_marketing_train_job_retired$poutcome, type="histogram", color = bank_marketing_train_job_retired$y)
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$poutcome, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$poutcome, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$poutcome)/num_retired_yes
##     failure nonexistent     success 
##   0.1424581   0.5837989   0.2737430
summary(bank_marketing_train_job_retired_n$poutcome)/num_retired_no
##     failure nonexistent     success 
##  0.12476723  0.83985102  0.03538175
# => yesはfailure, successが多い

# 以前のキャンペーンの接触回数
pl_yes <- plot_ly(x = bank_marketing_train_job_retired_y$previous, type="histogram", name = "yes")
pl_no <- plot_ly(x = bank_marketing_train_job_retired_n$previous, type="histogram", name = "no")
subplot(pl_yes, pl_no)
# 割合をみてみる
summary(bank_marketing_train_job_retired_y$previous)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.6508  1.0000  4.0000
summary(bank_marketing_train_job_retired_n$previous)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2086  0.0000  4.0000
# => yesは平均値が大きい

最終的なペルソナ

2.予測モデルを用いたアタックリストを作成する

lr3<-glm(y~.-day_of_week-duration-campaign,
        data=bank_marketing_train, family="binomial")

summary(lr3)
## 
## Call:
## glm(formula = y ~ . - day_of_week - duration - campaign, family = "binomial", 
##     data = bank_marketing_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9013  -0.4129  -0.3240  -0.2778   2.9504  
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -9.508e+01  1.600e+01  -5.944 2.79e-09 ***
## age                           1.383e-03  2.318e-03   0.597  0.55060    
## jobblue-collar               -1.963e-01  7.484e-02  -2.623  0.00871 ** 
## jobentrepreneur              -8.459e-02  1.153e-01  -0.734  0.46322    
## jobhousemaid                 -1.070e-01  1.389e-01  -0.770  0.44109    
## jobmanagement                -7.113e-02  8.211e-02  -0.866  0.38631    
## jobretired                    2.807e-01  1.028e-01   2.731  0.00632 ** 
## jobself-employed             -1.509e-01  1.146e-01  -1.317  0.18772    
## jobservices                  -1.568e-01  8.150e-02  -1.925  0.05428 .  
## jobstudent                    2.415e-01  1.088e-01   2.219  0.02649 *  
## jobtechnician                -2.015e-02  6.753e-02  -0.298  0.76540    
## jobunemployed                -5.372e-02  1.223e-01  -0.439  0.66040    
## jobunknown                   -2.881e-01  2.362e-01  -1.219  0.22267    
## maritalmarried                5.616e-02  6.552e-02   0.857  0.39137    
## maritalsingle                 1.196e-01  7.447e-02   1.605  0.10840    
## maritalunknown                6.134e-02  4.103e-01   0.150  0.88115    
## educationbasic.6y             4.597e-02  1.151e-01   0.399  0.68959    
## educationbasic.9y            -5.598e-02  8.954e-02  -0.625  0.53181    
## educationhigh.school         -1.073e-02  8.735e-02  -0.123  0.90226    
## educationilliterate           1.192e+00  6.560e-01   1.817  0.06914 .  
## educationprofessional.course  4.639e-02  9.657e-02   0.480  0.63096    
## educationuniversity.degree    1.052e-01  8.729e-02   1.206  0.22791    
## educationunknown              2.121e-01  1.133e-01   1.872  0.06122 .  
## defaultunknown               -3.176e-01  6.312e-02  -5.032 4.86e-07 ***
## defaultyes                   -7.644e+00  8.447e+01  -0.090  0.92790    
## housingunknown               -3.040e-02  1.279e-01  -0.238  0.81208    
## housingyes                   -4.811e-02  3.933e-02  -1.223  0.22125    
## loanunknown                          NA         NA      NA       NA    
## loanyes                      -8.754e-02  5.471e-02  -1.600  0.10960    
## contacttelephone             -8.601e-01  6.075e-02 -14.156  < 2e-16 ***
## pdays                        -1.176e-03  2.207e-04  -5.326 1.00e-07 ***
## previous                     -5.077e-02  6.156e-02  -0.825  0.40954    
## poutcomenonexistent           5.005e-01  9.467e-02   5.287 1.24e-07 ***
## poutcomesuccess               6.782e-01  2.158e-01   3.143  0.00167 ** 
## emp.var.rate                 -7.105e-01  6.552e-02 -10.844  < 2e-16 ***
## cons.price.idx                1.099e+00  1.043e-01  10.534  < 2e-16 ***
## cons.conf.idx                 3.879e-02  6.027e-03   6.436 1.23e-10 ***
## euribor3m                     6.074e-02  8.249e-02   0.736  0.46148    
## nr.employed                  -1.506e-03  1.451e-03  -1.038  0.29949    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23735  on 33743  degrees of freedom
## Residual deviance: 19114  on 33706  degrees of freedom
## AIC: 19190
## 
## Number of Fisher Scoring iterations: 9
## step関数
lr4 <- step(lr3)
## Start:  AIC=19189.68
## y ~ (age + job + marital + education + default + housing + loan + 
##     contact + day_of_week + duration + campaign + pdays + previous + 
##     poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     euribor3m + nr.employed) - day_of_week - duration - campaign
## 
##                  Df Deviance   AIC
## - marital         3    19117 19187
## - age             1    19114 19188
## - euribor3m       1    19114 19188
## - previous        1    19114 19188
## - nr.employed     1    19115 19189
## - education       7    19127 19189
## - housing         1    19115 19189
## <none>                 19114 19190
## - loan            1    19116 19190
## - job            11    19150 19204
## - default         2    19141 19213
## - pdays           1    19142 19216
## - poutcome        2    19147 19219
## - cons.conf.idx   1    19155 19229
## - cons.price.idx  1    19221 19295
## - emp.var.rate    1    19230 19304
## - contact         1    19327 19401
## 
## Step:  AIC=19186.58
## y ~ age + job + education + default + housing + loan + contact + 
##     pdays + previous + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - age             1    19117 19185
## - euribor3m       1    19117 19185
## - previous        1    19117 19185
## - nr.employed     1    19118 19186
## - housing         1    19118 19186
## - education       7    19130 19186
## <none>                 19117 19187
## - loan            1    19119 19187
## - job            11    19158 19206
## - default         2    19143 19209
## - pdays           1    19145 19213
## - poutcome        2    19150 19216
## - cons.conf.idx   1    19159 19227
## - cons.price.idx  1    19225 19293
## - emp.var.rate    1    19234 19302
## - contact         1    19331 19399
## 
## Step:  AIC=19184.58
## y ~ job + education + default + housing + loan + contact + pdays + 
##     previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - euribor3m       1    19117 19183
## - previous        1    19117 19183
## - nr.employed     1    19118 19184
## - housing         1    19118 19184
## - education       7    19130 19184
## <none>                 19117 19185
## - loan            1    19119 19185
## - job            11    19160 19206
## - default         2    19144 19208
## - pdays           1    19145 19211
## - poutcome        2    19150 19214
## - cons.conf.idx   1    19159 19225
## - cons.price.idx  1    19225 19291
## - emp.var.rate    1    19234 19300
## - contact         1    19331 19397
## 
## Step:  AIC=19183.09
## y ~ job + education + default + housing + loan + contact + pdays + 
##     previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     nr.employed
## 
##                  Df Deviance   AIC
## - nr.employed     1    19118 19182
## - previous        1    19118 19182
## - housing         1    19119 19183
## - education       7    19131 19183
## <none>                 19117 19183
## - loan            1    19120 19184
## - job            11    19160 19204
## - default         2    19145 19207
## - pdays           1    19145 19209
## - poutcome        2    19151 19213
## - cons.conf.idx   1    19227 19291
## - emp.var.rate    1    19248 19312
## - cons.price.idx  1    19256 19320
## - contact         1    19332 19396
## 
## Step:  AIC=19181.66
## y ~ job + education + default + housing + loan + contact + pdays + 
##     previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx
## 
##                  Df Deviance   AIC
## - previous        1    19118 19180
## - housing         1    19119 19181
## - education       7    19132 19182
## <none>                 19118 19182
## - loan            1    19120 19182
## - job            11    19161 19203
## - default         2    19145 19205
## - pdays           1    19146 19208
## - poutcome        2    19151 19211
## - cons.conf.idx   1    19253 19315
## - contact         1    19362 19424
## - cons.price.idx  1    19642 19704
## - emp.var.rate    1    20622 20684
## 
## Step:  AIC=19180.23
## y ~ job + education + default + housing + loan + contact + pdays + 
##     poutcome + emp.var.rate + cons.price.idx + cons.conf.idx
## 
##                  Df Deviance   AIC
## - housing         1    19120 19180
## - education       7    19132 19180
## <none>                 19118 19180
## - loan            1    19121 19181
## - job            11    19162 19202
## - default         2    19146 19204
## - pdays           1    19147 19207
## - poutcome        2    19218 19276
## - cons.conf.idx   1    19253 19313
## - contact         1    19363 19423
## - cons.price.idx  1    19667 19727
## - emp.var.rate    1    20653 20713
## 
## Step:  AIC=19179.66
## y ~ job + education + default + loan + contact + pdays + poutcome + 
##     emp.var.rate + cons.price.idx + cons.conf.idx
## 
##                  Df Deviance   AIC
## - loan            2    19123 19179
## - education       7    19133 19179
## <none>                 19120 19180
## - job            11    19163 19201
## - default         2    19147 19203
## - pdays           1    19149 19207
## - poutcome        2    19219 19275
## - cons.conf.idx   1    19256 19314
## - contact         1    19363 19421
## - cons.price.idx  1    19669 19727
## - emp.var.rate    1    20654 20712
## 
## Step:  AIC=19178.48
## y ~ job + education + default + contact + pdays + poutcome + 
##     emp.var.rate + cons.price.idx + cons.conf.idx
## 
##                  Df Deviance   AIC
## - education       7    19136 19178
## <none>                 19123 19179
## - job            11    19166 19200
## - default         2    19150 19202
## - pdays           1    19151 19205
## - poutcome        2    19222 19274
## - cons.conf.idx   1    19259 19313
## - contact         1    19366 19420
## - cons.price.idx  1    19672 19726
## - emp.var.rate    1    20658 20712
## 
## Step:  AIC=19178.22
## y ~ job + default + contact + pdays + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx
## 
##                  Df Deviance   AIC
## <none>                 19136 19178
## - default         2    19164 19202
## - pdays           1    19165 19205
## - job            11    19191 19211
## - poutcome        2    19237 19275
## - cons.conf.idx   1    19280 19320
## - contact         1    19385 19425
## - cons.price.idx  1    19693 19733
## - emp.var.rate    1    20681 20721
AIC(lr4)
## [1] 19178.22
summary(lr4)
## 
## Call:
## glm(formula = y ~ job + default + contact + pdays + poutcome + 
##     emp.var.rate + cons.price.idx + cons.conf.idx, family = "binomial", 
##     data = bank_marketing_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9100  -0.4162  -0.3238  -0.2797   2.9583  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.110e+02  4.650e+00 -23.863  < 2e-16 ***
## jobblue-collar      -2.618e-01  6.097e-02  -4.295 1.75e-05 ***
## jobentrepreneur     -9.485e-02  1.138e-01  -0.834 0.404516    
## jobhousemaid        -1.419e-01  1.321e-01  -1.074 0.282787    
## jobmanagement       -4.861e-02  8.023e-02  -0.606 0.544562    
## jobretired           2.660e-01  8.171e-02   3.255 0.001133 ** 
## jobself-employed    -1.380e-01  1.136e-01  -1.214 0.224663    
## jobservices         -2.120e-01  7.743e-02  -2.738 0.006180 ** 
## jobstudent           2.540e-01  9.971e-02   2.547 0.010855 *  
## jobtechnician       -2.225e-02  6.004e-02  -0.371 0.710965    
## jobunemployed       -9.283e-02  1.206e-01  -0.769 0.441599    
## jobunknown          -2.458e-01  2.320e-01  -1.060 0.289202    
## defaultunknown      -3.172e-01  6.211e-02  -5.106 3.28e-07 ***
## defaultyes          -7.640e+00  8.448e+01  -0.090 0.927943    
## contacttelephone    -8.798e-01  5.733e-02 -15.347  < 2e-16 ***
## pdays               -1.109e-03  2.059e-04  -5.386 7.22e-08 ***
## poutcomenonexistent  5.607e-01  6.150e-02   9.116  < 2e-16 ***
## poutcomesuccess      7.352e-01  2.071e-01   3.551 0.000384 ***
## emp.var.rate        -7.352e-01  1.897e-02 -38.753  < 2e-16 ***
## cons.price.idx       1.190e+00  4.988e-02  23.869  < 2e-16 ***
## cons.conf.idx        4.441e-02  3.720e-03  11.940  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23735  on 33743  degrees of freedom
## Residual deviance: 19136  on 33723  degrees of freedom
## AIC: 19178
## 
## Number of Fisher Scoring iterations: 9

考察

  • ここで、ageなどの説明変数の重要性が減ってしまうのは、emp.var.rateなどの説明変数の影響が大きいためと思われる
  • ペルソナを定義するのに使用した説明変数と、それ以外で重要な説明変数を用いてモデリングをおこなう方針とする
  • (emp.var.rate, cons.price.idx, cons.conf.idx を追加する)

続きはPythonで行う

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.